#!/usr/bin/env Rscript

"> three-way_per-pos_beta.R <

Reads in betas from EM-seq, WGBS and ONT Cas9 experiments, and compares them 
on a per-position basis.
" -> doc

setwd('~/csiro/stopwatch/cpgberus/16_loci_specific_three_way/')
RDNA_FASTA <- '../data/homo_sapiens.45s.fa.gz'
EMSEQ_COV <- Sys.glob('./WR*ER.hsap_45s.cov.gz')
WGBS_COV <- Sys.glob('./WR*WR.hsap_45s.cov.gz')
ONT_BED <- Sys.glob('./WR*.bed.gz')

suppressPackageStartupMessages({
  library(cowplot)
  library(eulerr)
  library(ggplot2)
  library(RColorBrewer)
})

# function to pretty-print diagnostic messages
diag_message <- function(...) {
  message('[', format(Sys.time(), "%H:%M:%S"), '] ', ...)
}

# function to annotate line of best fit in scatterplot
lm_eqn <- function(df, x, y) {
  # modified from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
  model <- lm(as.formula(paste(y, '~', x)), df)
  eq <- substitute(
    italic(y) == c + m %.% italic(x)*','~italic(r)^2 == r2*','~italic(p) == pval,
    list(c = format(unname(coef(model)[1]), digits=3),
         m = format(unname(coef(model)[2]), digits=3),
         r2 = format(summary(model)$r.squared, digits=4),
         pval = format(summary(model)$coefficients[2,4], digits=1)))
  as.character(as.expression(eq))
}

# function to calculate sum/len of a binary string
pct_of_ones <- function(binary_string) {
  int_vector <- as.integer(unlist(strsplit(binary_string, split = "")))
  
  sum(int_vector) / length(int_vector) * 100
}

# read data
rdna_seq <- scan(RDNA_FASTA, what='character', skip=1)

# read EM-seq and WGBS data into separate dfs, but logic-wise they're fairly similar
emseq_df <- data.frame()
for (f in EMSEQ_COV) {
  sample_id <- strsplit(f, '.', fixed=TRUE)[[1]][2]
  sample_id <- gsub('/', '', sample_id, fixed=TRUE)
  
  temp_df <- read.delim(
    f, header=FALSE, 
    col.names=c('scaf', 'pos', 'pos2', 'beta', 'meth', 'unmeth'))
  temp_df$sample_id <- sample_id
  
  emseq_df <- rbind(emseq_df, temp_df)
}

wgbs_df <- data.frame()
for (f in WGBS_COV) {
  sample_id <- strsplit(f, '.', fixed=TRUE)[[1]][2]
  sample_id <- gsub('/', '', sample_id, fixed=TRUE)
  
  temp_df <- read.delim(
    f, header=FALSE, 
    col.names=c('scaf', 'pos', 'pos2', 'beta', 'meth', 'unmeth'))
  temp_df$sample_id <- sample_id
  
  wgbs_df <- rbind(wgbs_df, temp_df)
}

# tidy up EM-seq/WGBS df
# first 1 kb is promoter sequence; make sure transcription starts at +1
emseq_df$pos <- emseq_df$pos - 1000
wgbs_df$pos <- wgbs_df$pos - 1000
emseq_df$cov <- emseq_df$meth + emseq_df$unmeth
wgbs_df$cov <- wgbs_df$meth + wgbs_df$unmeth

# ONT BEDs are "bedMethyl" files
# https://www.encodeproject.org/data-standards/wgbs/
ont_df <- data.frame()
for (f in ONT_BED) {
  sample_id <- strsplit(f, '.', fixed=TRUE)[[1]][2]
  sample_id <- gsub('/', '', sample_id, fixed=TRUE)
  
  temp_df <- read.delim(
    f, header=FALSE, 
    col.names=c('scaf', 'pos0', 'pos', 'name', 'score', 'strand',
                'startcodon', 'stopcodon', 'color_rgb', 'cov', 'beta'))
  temp_df$sample_id <- sample_id
  
  ont_df <- rbind(ont_df, temp_df)
}
# tidy up ONT df
# first 11 kb is promoter sequence; make sure transcription starts at +1
ont_df$pos <- ont_df$pos - 11000

# set comparative window to [1, 13332], the transcribed 45S region. both df
# should have data in this range
emseq_df <- emseq_df[emseq_df$pos >= 1 & emseq_df$pos <= 13332, ]
wgbs_df <- wgbs_df[wgbs_df$pos >= 1 & wgbs_df$pos <= 13332, ]
ont_df <- ont_df[ont_df$pos >= 1 & ont_df$pos <= 13332, ]

# set a coverage threshold of 50 to increase confidence in beta values
diag_message('Filter `emseq_df` for positions with coverage >= 50. ',
             'Original nrow: ', nrow(emseq_df), '; ',
             'filtered nrow: ', nrow(emseq_df[emseq_df$cov >= 50, ]))
## [16:21:15] Filter `emseq_df` for positions with coverage >= 50. Original nrow: 15294; filtered nrow: 14641
emseq_df <- emseq_df[emseq_df$cov >= 50, ]

diag_message('Filter `wgbs_df` for positions with coverage >= 50. ',
             'Original nrow: ', nrow(wgbs_df), '; ',
             'filtered nrow: ', nrow(wgbs_df[wgbs_df$cov >= 50, ]))
## [16:21:15] Filter `wgbs_df` for positions with coverage >= 50. Original nrow: 15057; filtered nrow: 13672
wgbs_df <- wgbs_df[wgbs_df$cov >= 50, ]

diag_message('Filter `ont_df` for positions with coverage >= 50. ',
             'Original nrow: ', nrow(ont_df), '; ',
             'filtered nrow: ', nrow(ont_df[ont_df$cov >= 50, ]))
## [16:21:15] Filter `ont_df` for positions with coverage >= 50. Original nrow: 14928; filtered nrow: 14928
ont_df <- ont_df[ont_df$cov >= 50, ]

# calculate mean coverage for each method at this point
diag_message('Mean coverage of remaining positions are: ',
             'EM-seq ', mean(emseq_df$cov), '; ',
             'WGBS ', mean(wgbs_df$cov), '; ',
             'ONT Cas9 ', mean(ont_df$cov))
## [16:21:15] Mean coverage of remaining positions are: EM-seq 946.86182637798; WGBS 526.671006436513; ONT Cas9 608.189643622722
# only keep relevant columns
emseq_df <- emseq_df[, c('pos', 'beta', 'sample_id')]
wgbs_df <- wgbs_df[, c('pos', 'beta', 'sample_id')]
ont_df <- ont_df[, c('pos', 'beta', 'sample_id')]

# convert long-to-wide
emseq_wide_df <- reshape(emseq_df, idvar='pos', timevar='sample_id', direction='wide')
colnames(emseq_wide_df) <- gsub('^beta.', '', colnames(emseq_wide_df))
emseq_wide_df <- emseq_wide_df[order(emseq_wide_df$pos), ]

wgbs_wide_df <- reshape(wgbs_df, idvar='pos', timevar='sample_id', direction='wide')
colnames(wgbs_wide_df) <- gsub('^beta.', '', colnames(wgbs_wide_df))
wgbs_wide_df <- wgbs_wide_df[order(wgbs_wide_df$pos), ]

ont_wide_df <- reshape(ont_df, idvar='pos', timevar='sample_id', direction='wide')
colnames(ont_wide_df) <- gsub('^beta.', '', colnames(ont_wide_df))
ont_wide_df <- ont_wide_df[order(ont_wide_df$pos), ]

# check how many rows have NA beta values in ANY of the four datasets (driven
# by insufficient coverage)
diag_message('Number of incomplete rows in `emseq_wide_df`: ',
             sum(!complete.cases(emseq_wide_df)))
## [16:21:15] Number of incomplete rows in `emseq_wide_df`: 65
diag_message('Number of incomplete rows in `wgbs_wide_df`: ',
             sum(!complete.cases(wgbs_wide_df)))
## [16:21:15] Number of incomplete rows in `wgbs_wide_df`: 109
diag_message('Number of incomplete rows in `ont_wide_df`: ',
             sum(!complete.cases(ont_wide_df)))
## [16:21:15] Number of incomplete rows in `ont_wide_df`: 0
# hmm, not a lot. drop positions where methylation levels were NA in ANY of the
# four datasets
emseq_wide_df <- emseq_wide_df[complete.cases(emseq_wide_df), ]
wgbs_wide_df <- wgbs_wide_df[complete.cases(wgbs_wide_df), ]
ont_wide_df <- ont_wide_df[complete.cases(ont_wide_df), ]

# do WGBS datasets have fewer positions with meth data than EM-seq/ONT? check
# overlap of positions with methylation data across three different techniques 
pos_list <- list(`EM-seq`=emseq_wide_df$pos,
                 WGBS=wgbs_wide_df$pos,
                 `ONT Cas9`=ont_wide_df$pos)
plot(venn(pos_list))

# ... yes.

# combine the wide dfs for more diagnostic plots
wide_df <- merge(emseq_wide_df, wgbs_wide_df, by='pos', all=TRUE,
                 suffixes=c('_emseq', '_wgbs'))
wide_df <- merge(wide_df, ont_wide_df, by='pos', all=TRUE)

# select for positions that are present in all 12 datasets
wide_df <- wide_df[complete.cases(wide_df), ]
rownames(wide_df) <- wide_df$pos
wide_df <- wide_df[, -1]

# calculate per-position MEAN methylation levels for every method separately
wide_df$meanER <- rowMeans(wide_df[, grepl('ER$', colnames(wide_df))])
wide_df$meanWR <- rowMeans(wide_df[, grepl('WR$', colnames(wide_df))])
wide_df$meanO <- rowMeans(wide_df[, grepl('O$', colnames(wide_df))])
head(wide_df)
##    WR025V1ER WR025V9ER WR069V1ER WR069V9ER WR025V1WR WR025V9WR WR069V1WR
## 8   18.44603  17.27219  17.25632  17.11510  21.24736  18.87006 15.983607
## 9   16.88144  17.75188  16.67913  15.64516  15.53589  14.63940 13.067151
## 21  13.06954  12.84349  13.37719  12.09801  13.96825  12.98701 12.788260
## 22  13.12010  13.19832  11.97877  11.54472  11.60275  10.79295  8.780037
## 29  20.63397  20.46679  20.59041  20.81727  23.50381  20.00000 18.474758
## 30  20.64725  19.05099  20.63253  19.67742  18.20896  17.05171 16.558140
##    WR069V9WR WR025V1O WR025V9O WR069V1O WR069V9O   meanER   meanWR  meanO
## 8   18.95332     17.6     17.4     16.2     15.9 17.52241 18.76359 16.775
## 9   12.62799     16.3     21.1     19.7     12.4 16.73941 13.96761 17.375
## 21  12.91727     13.1     15.2     13.5     14.7 12.84706 13.16520 14.125
## 22  11.77156     18.3     19.2     17.5     13.3 12.46048 10.73683 17.075
## 29  23.66864     22.8     22.8     23.6     22.4 20.62711 21.41180 22.900
## 30  18.20303     28.5     38.3     29.2     22.9 20.00205 17.50546 29.725
# sanity check: should match the intersection of the three circles in venn diagram
nrow(wide_df)    # expected: 3336
## [1] 3336
# plot a PCA to visualise overall profile of datasets
set.seed(1337)
pca <- prcomp(t(wide_df), scale=TRUE)
pca_coords <- as.data.frame(pca$x)
eigs <- pca$sdev ^ 2

pca_df <- data.frame(PC1=pca_coords$PC1,
                     PC2=pca_coords$PC2,
                     row.names=rownames(pca_coords))
pca_df$method <- rownames(pca_coords)
pca_df$method <- gsub('^.*ER$', 'EM-seq', pca_df$method)
pca_df$method <- gsub('^.*WR$', 'WGBS', pca_df$method)
pca_df$method <- gsub('^.*O$', 'ONT Cas9', pca_df$method)
pca_df$sample <- gsub('ER$|WR$|O$', '', rownames(pca_df))

g <- ggplot(pca_df, aes(x=PC1, y=PC2, color=method, fill=method, shape=sample)) +
  geom_point(size=3, alpha=0.5) +
  scale_shape_manual(values=21:25) +
  xlab(paste0('PC1 (', round(eigs[1] / sum(eigs) * 100, 2), '%)')) +
  ylab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
  theme_minimal(12) +
  theme(legend.position='top', legend.box='vertical', legend.margin=margin())
print(g)

ggsave('three-way.pca.pdf', width=6, height=6)

# PCA indicates variation is largest across methods, much less variation across
# samples. which is why per-method means were computed. subsequent plots are
# based off these mean values
# plot EM-seq vs. WGBS
ggplot(wide_df, aes(x=meanER, y=meanWR)) +
  geom_point(alpha=0.2) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
  ggtitle('Per-position methylation levels, EM-seq vs. WGBS') +
  xlab('EM-seq') +
  ylab('WGBS') +
  theme_minimal(12)

# plot EM-seq vs. ONT
ggplot(wide_df, aes(x=meanER, y=meanO)) +
  geom_point(alpha=0.2) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanO'), parse=TRUE) +
  ggtitle('Per-position methylation levels, EM-seq vs. ONT Cas9') +
  xlab('EM-seq') +
  ylab('ONT Cas9') +
  theme_minimal(12)

# WGBS vs. ONT
ggplot(wide_df, aes(x=meanWR, y=meanO)) +
  geom_point(alpha=0.2) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanWR', 'meanO'), parse=TRUE) +
  ggtitle('Per-position methylation levels, WGBS vs. ONT Cas9') +
  xlab('WGBS') +
  ylab('ONT Cas9') +
  theme_minimal(12)

# create a long df for plotting
long_df <- wide_df[, c('meanER', 'meanWR', 'meanO')]
long_df$pos <- as.numeric(rownames(long_df))
long_df <- reshape(long_df, direction='long',
                   idvar='pos',
                   varying=c('meanER', 'meanWR', 'meanO'), v.name='beta',
                   timevar='method', times=c('EM-seq', 'WGBS', 'ONT Cas9'))

# sort by pos, reset row numbering
long_df <- long_df[order(long_df$pos), ]
rownames(long_df) <- NULL
head(long_df)
##   pos   method     beta
## 1   8   EM-seq 17.52241
## 2   8     WGBS 18.76359
## 3   8 ONT Cas9 16.77500
## 4   9   EM-seq 16.73941
## 5   9     WGBS 13.96761
## 6   9 ONT Cas9 17.37500
# calculate GC% in a window of WINDOW_BP
WINDOW_BP <- 101  # this value should be odd, so that the base at midpoint
                  # is sandwiched by equal num of bases upstream and downstream
bp_per_side <- (WINDOW_BP - 1) / 2

# create a string where C/G == 1 and A/T == 0, then
#   GC% = sum(window) / len(window)
rdna_binary <- gsub('C', '1', rdna_seq, ignore.case=TRUE)
rdna_binary <- gsub('G', '1', rdna_binary, ignore.case=TRUE)
rdna_binary <- gsub('[^1]', '0', rdna_binary, ignore.case=TRUE)
gcpct_df <- data.frame(pos=(bp_per_side + 1):(nchar(rdna_seq) - bp_per_side),
                       gcpct=0)
gcpct_df$gcpct <- unlist(lapply(
  gcpct_df$pos, 
  function(x) pct_of_ones(substr(rdna_binary, x - bp_per_side, x + bp_per_side))))

# first 1 kb is promoter sequence; make sure transcription starts at +1
gcpct_df$pos <- gcpct_df$pos - 1000
# plot beta across loci
g1 <- ggplot(long_df, aes(x=pos, y=beta, color=method)) +
  geom_point(alpha=0.1) +
  geom_smooth(method='loess', span=0.1) +
  scale_color_manual(values=c('#1b9e77','#d95f02','#7570b3')) +
  coord_cartesian(xlim=c(0, 13332)) +
  ggtitle('Methylation levels across 45S loci') +
  xlab('Position') +
  ylab('Methylation level (%)') +
  theme_minimal(12) +
  theme(legend.position='top',
        axis.title.x=element_blank(),
        axis.text.x=element_blank())

g2 <- ggplot(gcpct_df, aes(x=pos, y=gcpct)) +
  geom_smooth(method='loess', span=0.1) +
  coord_cartesian(xlim=c(0, 13332)) +
  xlab('Position') +
  ylab('GC%') +
  theme_minimal(12)

plot_grid(g1, g2, ncol=1, rel_heights=c(0.8, 0.2))
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# does GC% influence the discrepancies in beta? re-do scatterplot, but colour
# points by GC%
# 
# merge dataframes together (inner-join), reset row names
wide_df <- merge(wide_df, gcpct_df, by.x=0, by.y='pos', all=FALSE)
colnames(wide_df)[1] <- 'pos'
wide_df$pos <- as.numeric(wide_df$pos)
wide_df <- wide_df[order(wide_df$pos), ]
rownames(wide_df) <- NULL
head(wide_df)
##   pos WR025V1ER WR025V9ER WR069V1ER WR069V9ER WR025V1WR WR025V9WR WR069V1WR
## 1   8  18.44603  17.27219  17.25632  17.11510  21.24736  18.87006 15.983607
## 2   9  16.88144  17.75188  16.67913  15.64516  15.53589  14.63940 13.067151
## 3  21  13.06954  12.84349  13.37719  12.09801  13.96825  12.98701 12.788260
## 4  22  13.12010  13.19832  11.97877  11.54472  11.60275  10.79295  8.780037
## 5  29  20.63397  20.46679  20.59041  20.81727  23.50381  20.00000 18.474758
## 6  30  20.64725  19.05099  20.63253  19.67742  18.20896  17.05171 16.558140
##   WR069V9WR WR025V1O WR025V9O WR069V1O WR069V9O   meanER   meanWR  meanO
## 1  18.95332     17.6     17.4     16.2     15.9 17.52241 18.76359 16.775
## 2  12.62799     16.3     21.1     19.7     12.4 16.73941 13.96761 17.375
## 3  12.91727     13.1     15.2     13.5     14.7 12.84706 13.16520 14.125
## 4  11.77156     18.3     19.2     17.5     13.3 12.46048 10.73683 17.075
## 5  23.66864     22.8     22.8     23.6     22.4 20.62711 21.41180 22.900
## 6  18.20303     28.5     38.3     29.2     22.9 20.00205 17.50546 29.725
##      gcpct
## 1 62.37624
## 2 63.36634
## 3 68.31683
## 4 67.32673
## 5 67.32673
## 6 66.33663
# plot EM-seq vs. WGBS and colour points by GC%
g3 <- ggplot(wide_df, aes(x=meanER, y=meanWR, color=gcpct)) +
  geom_point(alpha=0.4) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(55, 95),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 55), ylim=c(0, 55)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
  ggtitle('Per-position methylation levels') +
  xlab('EM-seq (%)') +
  ylab('WGBS (%)') +
  theme_minimal(12) +
  theme(legend.position=c(0.75, 0.1))

# plot EM-seq vs. ONT Cas9 and colour points by GC%
g4 <- ggplot(wide_df, aes(x=meanER, y=meanO, color=gcpct)) +
  geom_point(alpha=0.4) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(55, 95),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 55), ylim=c(0, 55)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanO'), parse=TRUE) +
  ggtitle('') +
  xlab('EM-seq (%)') +
  ylab('ONT Cas9 (%)') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

# plot WGBS vs. ONT Cas9 and colour points by GC%
g5 <- ggplot(wide_df, aes(x=meanWR, y=meanO, color=gcpct)) +
  geom_point(alpha=0.4) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(55, 95),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 55), ylim=c(0, 55)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanWR', 'meanO'), parse=TRUE) +
  ggtitle('') +
  xlab('WGBS (%)') +
  ylab('ONT Cas9 (%)') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

# do, like, proper stats
wide_df$delta_wgbs_emseq <- wide_df$meanWR - wide_df$meanER
summary(lm(wide_df$delta_wgbs_emseq ~ wide_df$gcpct))
## 
## Call:
## lm(formula = wide_df$delta_wgbs_emseq ~ wide_df$gcpct)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6962  -2.8369  -0.4385   2.4450  20.8407 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -15.588541   0.608575  -25.61   <2e-16 ***
## wide_df$gcpct   0.225857   0.007925   28.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.959 on 3327 degrees of freedom
## Multiple R-squared:  0.1962, Adjusted R-squared:  0.196 
## F-statistic: 812.2 on 1 and 3327 DF,  p-value: < 2.2e-16
g6 <- ggplot(wide_df, aes(x=gcpct, y=delta_wgbs_emseq)) +
  geom_point(alpha=0.2) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(40, 95), ylim=c(-20, 40)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_wgbs_emseq'), parse=TRUE) +
  ggtitle('Per-position residual methylation levels vs. GC%') +
  xlab('GC%') +
  ylab('Residual WGBS - EM-seq (%)') +
  theme_minimal(12)

wide_df$delta_ont_emseq <- wide_df$meanO - wide_df$meanER
g7 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_emseq)) +
  geom_point(alpha=0.2) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(40, 95), ylim=c(-20, 40)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_ont_emseq'), parse=TRUE) +
  ggtitle('') +
  xlab('GC%') +
  ylab('Residual ONT Cas9 - EM-seq (%)') +
  theme_minimal(12)

wide_df$delta_ont_wgbs <- wide_df$meanO - wide_df$meanWR
g8 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_wgbs)) +
  geom_point(alpha=0.2) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(40, 95), ylim=c(-20, 40)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_ont_wgbs'), parse=TRUE) +
  ggtitle('') +
  xlab('GC%') +
  ylab('Residual ONT Cas9 - WGBS (%)') +
  theme_minimal(12)
plot_grid(g3, g6, g4, g7, g5, g8, ncol=2, rel_heights=c(1, 1, 1),
          labels=c('A', 'B', 'C', 'D', 'E', 'F'))

ggsave('three-way.beta_and_gc.pdf', width=10, height=12)

# list deps used in this script
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux bookworm/sid
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blis-openmp/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] RColorBrewer_1.1-2 ggplot2_3.3.5      eulerr_6.1.1       cowplot_1.1.1     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.8       bslib_0.3.1      compiler_4.1.2   pillar_1.6.4    
##  [5] jquerylib_0.1.4  highr_0.9        tools_4.1.2      digest_0.6.29   
##  [9] lattice_0.20-45  nlme_3.1-155     jsonlite_1.7.3   evaluate_0.14   
## [13] lifecycle_1.0.1  tibble_3.1.6     gtable_0.3.0     mgcv_1.8-38     
## [17] pkgconfig_2.0.3  rlang_0.4.12     Matrix_1.4-0     DBI_1.1.2       
## [21] yaml_2.2.1       xfun_0.29        fastmap_1.1.0    withr_2.4.3     
## [25] stringr_1.4.0    dplyr_1.0.7      knitr_1.37       generics_0.1.1  
## [29] sass_0.4.0       vctrs_0.3.8      tidyselect_1.1.1 grid_4.1.2      
## [33] glue_1.6.0       R6_2.5.1         fansi_1.0.2      rmarkdown_2.11  
## [37] polyclip_1.10-0  farver_2.1.0     polylabelr_0.2.0 purrr_0.3.4     
## [41] magrittr_2.0.1   splines_4.1.2    scales_1.1.1     htmltools_0.5.2 
## [45] ellipsis_0.3.2   assertthat_0.2.1 colorspace_2.0-2 labeling_0.4.2  
## [49] utf8_1.2.2       stringi_1.7.6    munsell_0.5.0    crayon_1.4.2